Bustos Roman, M. (2022). Steam Games Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DS/2109585
The JSON has a nested structure, where each game’s data is in a list
that is represented by the game id (appID).
The tags feature contains a value that resembles a
dictionary/table for each game, with the tags chosen by users as keys
and each tag's frequency as the value. I will create a separate
dataframe for this feature for further analysis.
# Import raw data and combine them.
# Read the JSON dataset: each top-level key is a game's appID and its value
# is a list of that game's attributes.
dataset <- fromJSON("games.json")
# Preallocate one slot per game instead of growing the lists inside the
# loop, which copies the whole list on every append.
games_list <- vector("list", length(dataset))
tags_list <- vector("list", length(dataset))
# Iterate over each appID in the dataset and build one single-row data frame
# per game.
for (i in seq_along(dataset)) {
  app <- names(dataset)[i]
  game <- dataset[[i]]
  # Collect row data.
  # NOTE(review): paste(..., collapse = ", ") always returns a string
  # (possibly ""), never NA, so the coalesce() wrapped around the paste()
  # calls is a no-op; missing list fields surface downstream as empty
  # strings rather than NA. Kept as-is to preserve downstream behavior.
  game_info <- data.frame(
    appID = app,
    name = coalesce(as.character(game$name), NA),
    releaseDate = coalesce(as.Date(game$release_date, format = "%b %d, %Y"), NA),
    estimatedOwners = coalesce(as.factor(game$estimated_owners), NA),
    required_age = coalesce(as.numeric(game$required_age), 0),
    price = coalesce(as.numeric(game$price), 0.0),
    dlcCount = coalesce(as.numeric(game$dlc_count), 0),
    languages = coalesce(paste(as.character(game$supported_languages), collapse = ", "), NA),
    fullAudioLanguages = coalesce(paste(as.character(game$full_audio_languages), collapse = ", "), NA),
    positive = coalesce(as.numeric(game$positive), 0),
    negative = coalesce(as.numeric(game$negative), 0),
    achievements = coalesce(as.numeric(game$achievements), 0),
    recommendations = coalesce(as.numeric(game$recommendations), 0),
    averagePlaytime = coalesce(as.numeric(game$average_playtime_forever), 0),
    medianPlaytime = coalesce(as.numeric(game$median_playtime_forever), 0),
    developers = coalesce(paste(as.character(game$developers), collapse = ", "), NA),
    publishers = coalesce(paste(as.character(game$publishers), collapse = ", "), NA),
    genres = coalesce(paste(as.character(game$genres), collapse = ", "), NA)
  )
  # Some other excluded features:
  #supportWindows = coalesce(as.logical(game$windows), FALSE),
  #supportMac = coalesce(as.logical(game$mac), FALSE),
  #supportLinux = coalesce(as.logical(game$linux), FALSE),
  #metacriticScore = coalesce(as.numeric(game$metacritic_score), 0),
  #userScore = coalesce(as.numeric(game$user_score), 0),
  # Store the game info in its preallocated slot.
  games_list[[i]] <- game_info
  # Extract tags (a named list of tag -> user-vote frequency) into a
  # separate long-format data frame; emit a single NA row when tags are
  # absent so every appID is still represented in tags_df.
  if (!is.null(game$tags) && length(names(game$tags)) > 0 && length(game$tags) > 0) {
    tags_info <- data.frame(
      appID = app,
      tag = names(game$tags),
      frequency = as.numeric(game$tags),
      stringsAsFactors = FALSE
    )
  } else {
    # Placeholder NA row for games with no tags.
    tags_info <- data.frame(
      appID = app,
      tag = NA,
      frequency = NA,
      stringsAsFactors = FALSE
    )
  }
  # Store the tags data in its preallocated slot.
  tags_list[[i]] <- tags_info
}
# Combine the per-game rows into single data frames.
games_df <- do.call(rbind, games_list)
tags_df <- do.call(rbind, tags_list)
tags_df is in long format:
head(tags_df)
| appID | tag | frequency |
|---|---|---|
| 20200 | Indie | 22 |
| 20200 | Casual | 21 |
| 20200 | Sports | 21 |
| 20200 | Bowling | 6 |
| 655370 | Indie | 109 |
| 655370 | Action | 103 |
Since I have separated the game data set into two dataframes, I will perform data cleaning and preprocessing on each dataframe separately, and then keep only the games that remain in both dataframes before performing machine learning tasks.
games_df EDA and data cleaning

games_df <- games_df %>%
mutate(across(where(is.character), ~ gsub("^\\s+|\\s+$", "", .))) %>%
mutate(across(where(is.character), ~ iconv(., from = "UTF-8", to = "ASCII//TRANSLIT", sub = ""))) # Remove invalid or non-translatable characters
There are a total of 23 features in the games dataframe (excluding appID):
skim(games_df)
| Name | games_df |
| Number of rows | 97410 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| Date | 1 |
| factor | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| appID | 0 | 1 | 2 | 7 | 0 | 97410 | 0 |
| name | 0 | 1 | 0 | 207 | 7 | 94914 | 0 |
| languages | 0 | 1 | 0 | 1010 | 4831 | 12862 | 0 |
| fullAudioLanguages | 0 | 1 | 0 | 1010 | 57017 | 2500 | 0 |
| developers | 0 | 1 | 0 | 624 | 4873 | 55234 | 0 |
| publishers | 0 | 1 | 0 | 164 | 5099 | 48395 | 0 |
| genres | 0 | 1 | 0 | 254 | 4841 | 2843 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| releaseDate | 131 | 1 | 1997-06-30 | 2025-04-14 | 2021-07-09 | 4631 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| estimatedOwners | 0 | 1 | FALSE | 14 | 0 -: 61433, 0 -: 17263, 200: 8034, 500: 3978 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| required_age | 0 | 1 | 0.28 | 2.14 | 0 | 0.00 | 0.00 | 0.00 | 21.00 | ▇▁▁▁▁ |
| price | 0 | 1 | 7.09 | 12.39 | 0 | 0.99 | 4.19 | 9.99 | 999.98 | ▇▁▁▁▁ |
| dlcCount | 0 | 1 | 0.49 | 12.83 | 0 | 0.00 | 0.00 | 0.00 | 2366.00 | ▇▁▁▁▁ |
| positive | 0 | 1 | 848.94 | 22870.11 | 0 | 0.00 | 5.00 | 35.00 | 5764420.00 | ▇▁▁▁▁ |
| negative | 0 | 1 | 141.07 | 4278.02 | 0 | 0.00 | 1.00 | 10.00 | 895978.00 | ▇▁▁▁▁ |
| achievements | 0 | 1 | 18.56 | 160.41 | 0 | 0.00 | 0.00 | 17.00 | 9821.00 | ▇▁▁▁▁ |
| recommendations | 0 | 1 | 690.51 | 16817.17 | 0 | 0.00 | 0.00 | 0.00 | 3441592.00 | ▇▁▁▁▁ |
| averagePlaytime | 0 | 1 | 91.80 | 1068.49 | 0 | 0.00 | 0.00 | 0.00 | 145727.00 | ▇▁▁▁▁ |
| medianPlaytime | 0 | 1 | 81.85 | 1412.50 | 0 | 0.00 | 0.00 | 0.00 | 208473.00 | ▇▁▁▁▁ |
The features genres, developers,
publishers, languages,
fullAudioLanguages, are all in character datatype, these
features stores different amount of character elements within a single
string in comma separated style, since the study would not focus on text
processing, I will convert these features into a count variable.
Below defines a function to count amount of comma separated elements inside a string.
# Define the function
# Count the number of comma-separated elements inside a string.
#
# Args:
#   string: a single character value whose elements are joined by ", "
#           (the delimiter used when games_df's list columns were built).
# Returns:
#   0 for NA or empty input; otherwise the number of ", "-delimited
#   elements. Elements are NOT trimmed or filtered — "a, , b" counts as 3,
#   matching how the raw columns were assembled.
count_values <- function(string) {
  # Guard against missing or empty input. The NA check is required:
  # nchar(NA) is 2, so without it an NA would fall through and be
  # miscounted as one element.
  if (is.na(string) || nchar(string) == 0) {
    return(0)
  }
  # Split on the literal ", " delimiter and count the resulting elements.
  length(strsplit(string, ", ", fixed = TRUE)[[1]])
}
# Derive count features from the comma-separated string columns:
# developers, publishers, genres, languages, and fullAudioLanguages.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# plain numeric vector regardless of input, and the five mutate() calls are
# merged into one.
games_df <- games_df %>%
  mutate(
    developerCount = vapply(developers, count_values, numeric(1), USE.NAMES = FALSE),
    publisherCount = vapply(publishers, count_values, numeric(1), USE.NAMES = FALSE),
    genresCount = vapply(genres, count_values, numeric(1), USE.NAMES = FALSE),
    languagesCount = vapply(languages, count_values, numeric(1), USE.NAMES = FALSE),
    fullAudioLanguagesCount = vapply(fullAudioLanguages, count_values, numeric(1), USE.NAMES = FALSE)
  )
To ensure only historically active and non-free games are included,
games that are free or with no ratings or recommendations or
estimatedOwners = ‘0 - 20000’, or
averagePlaytime = 0, will be excluded as they are
irrelevant data points. Games with releaseDate >
‘2024-07-01’ will also be excluded, because these games are released too
recently and may not contain sufficient data for generalization.
games_df.filtered <- games_df %>% filter(recommendations > 0 & estimatedOwners != '0 - 0' & estimatedOwners != '0 - 20000' & averagePlaytime > 0 & (positive + negative) !=0 & releaseDate < "2024-07-01" & price != 0)
dim(games_df.filtered)
## [1] 8010 23
averagePlaytime & medianPlaytime

These features contain information on how many total minutes are spent on a game per player. The average is the mean minutes across players for a given game; the median means that half of all players played this many minutes or fewer.
summary(games_df.filtered$averagePlaytime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 134.0 261.0 706.4 591.8 145727.0
summary(games_df.filtered$medianPlaytime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 133.2 253.0 629.3 510.8 208473.0
“For playtime median is more useful, as average gets skewed by hardcore players for paid games and people that dropped the game after tutorial for free-to-play games.”
https://galyonk.in/steam-spy-the-missing-manual-cc22ef6eebe1 (Sergiy Garlyonkin’s blog)
A traditional skewness measure cannot be calculated since the standard deviation of the distribution of total playtime for a game is not provided, however, I will use the relative difference between the mean and median as an estimated measure of skewness:
# Estimated skew of each game's playtime distribution: the relative gap
# between mean and median playtime (positive => right-skewed), plus the raw
# mean/median ratio. averagePlaytime > 0 is guaranteed by the earlier filter.
games_df.filtered <- games_df.filtered %>%
  mutate(
    estimatedSkew = (averagePlaytime - medianPlaytime) / averagePlaytime,
    playtimeRatio = averagePlaytime / medianPlaytime
  )
hist(games_df.filtered$estimatedSkew)
There are many games with the exact same median and average playtime,
it is very rare to see a perfectly symmetrical distribution for total
gametime minutes, however, in cases of very small sample sizes for a
given game (e.g., just one or two observations), the mean and median
playtime can easily equal to each other, resulting in zero for this
measure, but this implies that the data collected for this game is not
accurate as the player data is insufficient. I will filter out games
where estimatedSkew == 0.
games_df.filtered <- games_df.filtered %>% filter(estimatedSkew != 0)
dim(games_df.filtered)
## [1] 5930 25
hist(games_df.filtered$estimatedSkew, main = 'Distribution of estimated skew', xlab = 'estimatedSkew', breaks = 30)
plot(x = games_df.filtered$estimatedSkew, y = log(games_df.filtered$recommendations + 1), pch = 20,
cex = 0.8, xlab = "Estimated Skew", ylab = "Log(Recommendations)")
There appears to be a positive relationship between the estimated skew and the log-transformed recommendation count.
Transforming minutes into hours and days.
games_df.filtered <- games_df.filtered %>%
mutate(averagePlayHours = averagePlaytime / 60) %>%
mutate(medianPlayHours = medianPlaytime / 60) %>%
mutate(averagePlayDays = averagePlayHours / 24) %>%
mutate(medianPlayDays = medianPlayHours / 24)
summary(games_df.filtered$averagePlayHours)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0333 2.7500 4.9500 13.2060 11.3833 1737.3000
summary(games_df.filtered$averagePlayDays)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00139 0.11458 0.20625 0.55025 0.47431 72.38750
hist(games_df.filtered$medianPlaytime, breaks = 50)
The medianPlaytime appears to have a highly skewed
distribution, I will perform log transformation for this feature.
games_df.filtered <- games_df.filtered %>% mutate(logMedianPlaytime = log(1 + medianPlaytime))
hist(log(games_df.filtered$logMedianPlaytime),
breaks = 40,
main = "Log Median Playtime distribution",
xlab = "Log Median Playtime")
aboveTwoHours indicator

One of the most critical aspects of the 2-hour mark is that it is the threshold for Steam’s refund policy. Players can request a refund for games they have played for less than 2 hours, which incentivizes developers to create games engaging enough that players exceed this playtime. This can lead to a higher likelihood of players keeping a game they enjoy beyond the initial hours. I will create an indicator for games whose median playtime exceeds 2 hours. Source
games_df.filtered <- games_df.filtered %>%
mutate(aboveTwoHours = ifelse(medianPlaytime > 120, 1, 0))
table(games_df.filtered$aboveTwoHours)
##
## 0 1
## 938 4992
genres, publishers, and
developers variables each contains a comma separated list
of the respective attributes stored in a string. I defined a helper
function to count unique values within a string list feature of the
dataframe.
count_feature <- function(df, string_feature){
  # Count unique values within a comma-separated string-list feature of `df`
  # (e.g. genres = "Casual, Indie, Sports").
  #
  # Args:
  #   df: a data frame holding the feature column.
  #   string_feature: the column name, as a string.
  # Returns:
  #   A data frame with one row per unique value and its Frequency,
  #   sorted in descending order of frequency.
  #
  # `[[` is used instead of `df[, string_feature]` so the function also
  # works on tibbles, where single-bracket indexing returns a one-column
  # tibble (which strsplit cannot handle) rather than a character vector.
  split_features <- strsplit(df[[string_feature]], ", ")
  # Flatten the per-row vectors into a single character vector.
  all_features <- unlist(split_features)
  # Frequency table of each unique value.
  feature_counts <- table(all_features)
  feature_counts_df <- as.data.frame(feature_counts)
  colnames(feature_counts_df) <- c(string_feature, "Frequency")
  feature_df <- feature_counts_df %>%
    arrange(desc(Frequency))
  feature_df
}
genres featureContains broad categories that define a game’s core mechanics and style of play.
head(count_feature(games_df.filtered, 'genres'),20)
| genres | Frequency |
|---|---|
| Indie | 3570 |
| Action | 2904 |
| Adventure | 2375 |
| Strategy | 1442 |
| RPG | 1382 |
| Simulation | 1361 |
| Casual | 1352 |
| Early Access | 378 |
| Racing | 228 |
| Sports | 208 |
| Massively Multiplayer | 128 |
| Utilities | 39 |
| Design & Illustration | 28 |
| Animation & Modeling | 17 |
| Violent | 16 |
| Web Publishing | 14 |
| Free to Play | 11 |
| Education | 9 |
| Gore | 9 |
| Video Production | 9 |
Although most of the game genres are related to games, some of them refers to either production software or other non-game related software.
I am removing these unrelated genres because this project will focus on playable video games and will not include non-gaming related applications, the games with non-gaming related genres also only accounts to less than 10% of all other genres.
The following code reviews the non-gaming related genres first.
# Non-game genres to drop: production software, utilities, and video-content
# categories that are not playable games.
exclude_genres <- c('Audio Production',
                    'Game Development',
                    'Design & Illustration',
                    'Utilities',
                    'Software Training',
                    'Photo Editing',
                    'Accounting',
                    'Animation & Modeling',
                    'Web Publishing',
                    'Video Production',
                    'Education',
                    'Movie',
                    'Short',
                    '360 Video',
                    'Documentary',
                    'Episodic',
                    'Tutorial')
# Split each game's genre string into a character vector for membership tests.
genres_split <- strsplit(as.character(games_df.filtered$genres), ", ")
# Flag every game whose genre list contains at least one excluded genre.
# vapply guarantees a logical vector with one element per game.
exclude_rows <- vapply(
  genres_split,
  function(game_genres) any(game_genres %in% exclude_genres),
  logical(1)
)
# Drop the flagged rows.
games_df.filtered <- games_df.filtered[!exclude_rows,]
Barplot of genre frequency
count_feature(games_df.filtered,'genres') %>%
ggplot(aes(x = reorder(genres, Frequency), y = Frequency)) +
geom_bar(stat = "identity") + # stat based on the values provided in the data frame.
coord_flip() +
labs(title = "Genre Frequency (Filtered)", x = "Genre", y = "Frequency")
age16This feature indicates whether a game’s required age is 16+.
games_df.filtered <- games_df.filtered %>%
mutate(age16 = ifelse(required_age > 16, TRUE, FALSE))
boxplot(price ~ age16 ,data = games_df.filtered)
boxplot(log(recommendations + 1) ~ age16, data = games_df.filtered, ylab = "log(recommendations)")
table(games_df.filtered$age16)
##
## FALSE TRUE
## 5350 529
It could be seen that games with age requirement of 16+ are both more recommended and higher priced. But the class is heavily imbalanced, stratified sampling method would be needed when using this predictor for training.
###developers & publishers
These are the developers who developed the game, and publishers who are responsible for distribution, marketing and sales of a game.
str(games_df.filtered$developers)
## chr [1:5879] "Pixelated Milk" "Team17 Digital Ltd" "Lonely Troops" ...
Distribution of
developersCount,publishersCount,genresCount
hist(games_df.filtered$developerCount, main = "Developers Count")
hist(games_df.filtered$publisherCount, main = "Publishers Count")
hist(games_df.filtered$genresCount, main = "Genres Count")
hist(games_df.filtered$languagesCount, main = "Languages Count")
Most of the games have a single developer and publisher, with a combination of 2-3 genres.
As self publishing a game, where the developer and the publisher are the same entity, is a very likely indicator of a game that is developed independently (Indie), identifying this characteristic may be a useful feature. I will be creating a indicator variable to indicate whether a game is self published, not including games with more than 1 developer or publisher.
games_df.filtered <- games_df.filtered %>%
mutate(selfPublished = (developerCount == 1 & publisherCount == 1) & # only in the case where they work solo
trimws(developers) == trimws(publishers)) # trim whitespace
table(games_df.filtered$selfPublished)
##
## FALSE TRUE
## 3305 2574
boxplot(price ~ selfPublished, games_df.filtered)
head(count_feature(games_df.filtered,'publishers'),20) %>%
ggplot(aes(x = reorder(publishers, Frequency), y = Frequency)) +
geom_bar(stat = "identity") + # stat based on the values provided in the data frame.
coord_flip() +
labs(title = "Publisher Frequency (Top 20)", x = "Category", y = "Frequency")
Frequency of developers and publishers in games with over 50,000
recommendations.
head(count_feature(games_df.filtered[games_df.filtered$recommendations >= 50000,],"developers"),15)
| developers | Frequency |
|---|---|
| Valve | 8 |
| Feral Interactive (Mac) | 6 |
| Ubisoft Montreal | 6 |
| Feral Interactive (Linux) | 5 |
| Ltd. | 5 |
| Aspyr (Mac) | 4 |
| Bethesda Game Studios | 4 |
| CAPCOM Co. | 4 |
| CD PROJEKT RED | 4 |
| Paradox Development Studio | 4 |
| Aspyr (Linux) | 3 |
| DICE | 3 |
| Firaxis Games | 3 |
| FromSoftware | 3 |
| Inc. | 3 |
head(count_feature(games_df.filtered[games_df.filtered$recommendations >= 50000,],"publishers"),15)
| publishers | Frequency |
|---|---|
| Ubisoft | 10 |
| Bethesda Softworks | 9 |
| Electronic Arts | 9 |
| Valve | 9 |
| Xbox Game Studios | 7 |
| 2K | 6 |
| Feral Interactive (Mac) | 6 |
| Paradox Interactive | 6 |
| Feral Interactive (Linux) | 5 |
| Square Enix | 5 |
| Aspyr (Mac) | 4 |
| CAPCOM Co. | 4 |
| CD PROJEKT RED | 4 |
| Ltd. | 4 |
| Aspyr (Linux) | 3 |
industryLeader indicatorAmong the lists of developers and publishers of games with over 50,000 recommendations, some big players in the gaming industry have showed up.
I have created a list of current industry leading game developer and
publishers based on this list with some research and domain knowledge.
This list consists of companies that are widely recognized and have more
resources in both developing and publishing a game to maximize sales. A
indicator variable will be created for whether the
developers or publishers column contains a
name from this list.
This helps the model in addressing games that may recieve a lot more recommendations simply because they were developed by industry leaders.
# Gaming leaders list: widely recognized developers/publishers with major
# resources. A game is flagged industryLeader when its publishers or
# developers field mentions any of these names (case-insensitive substring).
# Fix vs. original: "KOEI TECMO GAMES CO" and "LTD." were one name split on
# a comma — a bare "LTD." pattern (with "." as a regex wildcard) would flag
# every "..., Ltd.," company as an industry leader.
game_leaders <- c("Valve", "Bethesda Softworks","CD PROJEKT RED", "Xbox Game Studios", "2K", "CAPCOM Co.", "Bohemia Interactive","Facepunch Studios","THQ Nordic","Square Enix","Rockstar Games","Rockstar North","Rockstar Toronto", "Ubisoft", "Electronic Arts","Bethesda Game Studios", "Ubisoft Montreal","Game Science","DICE","Amazon Games","Eidos-Montreal","Snail Games USA","Klei Entertainment","Hello Games", "Gearbox Software","343 Industries","Techland","Bungie","FromSoftware","FromSoftware Inc.","BANDAI NAMCO Entertainment","SEGA","Capcom","KOEI TECMO GAMES CO., LTD.","Konami Digital Entertainment","Warner Bros. Games", "Warner Bros. Interactive Entertainment","WB Games","Deep Silver","Activision","PlayStation PC LLC","Lucasfilm", "LucasArts", "Disney","Pearl Abyss","Tripwire Interactive")
# Escape regex metacharacters (e.g. the "." in "CAPCOM Co.") so every name
# matches literally instead of "." acting as an any-character wildcard.
escape_regex <- function(x) gsub("([][{}()+*^$|\\\\?.])", "\\\\\\1", x)
game_leaders_regex <- paste(escape_regex(game_leaders), collapse = "|")
games_df.filtered$industryLeader <- grepl(game_leaders_regex, games_df.filtered$publishers, ignore.case = TRUE) |
  grepl(game_leaders_regex, games_df.filtered$developers, ignore.case = TRUE)
table(games_df.filtered$industryLeader)
##
## FALSE TRUE
## 4723 1156
recommendationsThis feature contains the number of recommendations given out by players.
summary(games_df.filtered$recommendations)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101 513 1396 8150 4313 899477
hist(games_df.filtered$recommendations)
Log-transformed recommendations:
games_df.filtered <- games_df.filtered %>%
mutate(logRecommendations = log(recommendations + 1))
hist(games_df.filtered$logRecommendations)
Games with recommendation count exceeding 100,000 (~exp(11.50)) are determined as extreme outliers within this dataset and are removed due to the limited data of such games, their inclusion could significantly impact model performance and potentially hinder accurate predictions for the majority of games with more typical recommendation counts.
games_df.filtered<- games_df.filtered %>% filter(recommendations < 100000)
hist(games_df.filtered$logRecommendations, main = "Histogram of log Recommendations", xlab = "log Recommendations", breaks = 30)
hist(games_df.filtered$recommendations, main = "Histogram of Recommendations", xlab = "Recommendations", breaks = 40)
boxplot(logRecommendations ~ aboveTwoHours, data = games_df.filtered)
boxplot(logRecommendations ~ industryLeader, games_df.filtered)
The boxplots show that games with median playtime above two hours, or games from industry leaders, receive a higher amount of recommendations on average.
boxplot(logRecommendations ~ languagesCount, games_df.filtered)
Games with more supported languages were also shown to receive more recommendations.
price:hist(games_df.filtered$price, main = "Price distribution")
summary(games_df.filtered$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.35 4.99 11.41 15.07 19.99 69.99
boxplot(price ~ industryLeader, data = games_df.filtered)
boxplot(price ~ aboveTwoHours, data = games_df.filtered)
Games from industry leaders or with over two hours median playtime are on average higher priced.
priceRange0-10 = ‘Low’ 11-20 = ‘Medium’ 21 or above = ‘High’
# Define price ranges
# NOTE: cut() with right = TRUE produces left-open intervals (0, 10],
# (10, 20], (20, Inf], so a price of exactly 10 is "Low" and exactly 20 is
# "Medium". A price of exactly 0 would be NA, but zero-priced games were
# already filtered out upstream.
price_breaks <- c(0, 10, 20, Inf)
# Create labels for the price ranges
price_labels <- c("Low", "Medium", "High")
# Create the categorical variable
games_df.filtered$priceRange <- cut(games_df.filtered$price, breaks = price_breaks, labels = price_labels, right = TRUE)
table(games_df.filtered$priceRange)
##
## Low Medium High
## 2867 1861 1078
boxplot(logRecommendations ~ priceRange, data=games_df.filtered)
Boxplot shows the price range of games have some positive relationships with log-recommendations.
releaseDateThis feature is the date that the game has been published/released on the Steam platform specifically, and does not reflect the earliest release date of a game on any other platform.
I will perform some feature engineering on the
releaseDate variable to obtain years since released, days
since released, released month, and released year:
# Feature-engineer releaseDate: calendar month and year of release, plus the
# game's age in whole years (relative to 2024) and in days (relative to
# 2024-10-01).
games_df.filtered <- games_df.filtered %>%
  mutate(
    releaseMonth = month(releaseDate),
    releaseYear = year(releaseDate),
    yearsReleased = 2024 - releaseYear,
    daysReleased = as.numeric(as.Date("2024-10-01") - releaseDate)
  )
# boxplots
boxplot(logRecommendations ~ yearsReleased,
data = games_df.filtered)
boxplot(price ~ yearsReleased,
data = games_df.filtered)
It is shown that price of games decreases gradually as released years
gets higher until the price reaches around 10.
plot(games_df.filtered$daysReleased, games_df.filtered$logRecommendations, pch = 20,
cex = 0.8, xlab = "Days Released", ylab = "Log Recommendations")
The recommendations count also seems to decrease slightly as days
released gets higher.
More boxplots on price:
par(mfrow = c(1,2))
boxplot(price ~ yearsReleased,
data = games_df.filtered[games_df.filtered$selfPublished == TRUE,],
main = "Self-Published")
boxplot(price ~ yearsReleased,
data = games_df.filtered[games_df.filtered$selfPublished == FALSE,],
main = "Non-Self-Published")
Self-Published games are shown to have lower price on average,
compared to other games throughout the years. Some interaction between
selfPublished and yearsReleased is observed
when looking at their effects on price.
hist(games_df.filtered$releaseYear, main = "#Games Released by year", breaks = 30)
The surge in games listed on Steam from 2013 to 2016 could be due to changes in Valve (Steam) policies that allowed developers to submit their games for community voting and approvals. After that, a rapid drop from 2016 onwards could be due to some of the game records missed during data collection.
I will remove games released on Steam from 1995 - 2008 since these older games that could introduce patterns that no longer hold true and would not be relevant for the task of modelling based on more recent dynamics of the market.
games_df.filtered <- games_df.filtered %>% filter(releaseDate > '2008-12-31')
dim(games_df.filtered)
## [1] 5596 40
positive/negativeNumber of positive or negative reviews.
I created a new variable which reflects the proportion of positive
reviews as positivePct.
games_df.filtered <- games_df.filtered %>%
mutate(positivePct = coalesce(positive/(positive + negative), 0))
hist(games_df.filtered$positivePct, main = "Positive Pct")
plot(x = games_df.filtered$positivePct, y = games_df.filtered$logRecommendations, pch = 20,
cex = 0.8, xlab = "Positive pct.", ylab = "Log Recommendations")
Most games have a high positive rating percentage, with the most amount of games with 80-90% positive reviews.
achievementsThis feature contains number of achievements/milestones available in a game. The distribution of achievement amount is right-skewed with the long tail on higher amount of achievements. I will log transform this feature as a new feature for easier interpretation.
print(summary(games_df.filtered$achievements))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 10.00 25.00 50.71 46.00 5000.00
hist(games_df.filtered$achievements, breaks = 30)
games_df.filtered <- games_df.filtered %>% mutate(logAchievements = log(1 + achievements))
hist(games_df.filtered$logAchievements, breaks = 30)
It seems that most games have zero achievements (log(1)), but many also have extremely large number of achievements
boxplot(logAchievements ~ aboveTwoHours, data = games_df.filtered)
boxplot(logAchievements ~ priceRange, data = games_df.filtered)
It is shown that games with median playtime over two hours have at least some achievements in the game, and games with higher price have more achievements on average.
dlcCountThis feature contains number of downloadable contents “DLC” available in a game.
hist(games_df.filtered$dlcCount, breaks = 100)
I will cap the dlcCount to 30, which is a suitable cap
for most games, games with over 30 DLCs usually have a very specific
design which is to update and expand gaming contents constantly, there
are only 56 games with over 30 DLCs.
head(games_df.filtered[games_df.filtered$dlcCount > 30,][1:6])
| appID | name | releaseDate | estimatedOwners | required_age | price | |
|---|---|---|---|---|---|---|
| 14 | 730310 | DYNASTY WARRIORS 9 | 2018-02-13 | 1000000 - 2000000 | 0 | 44.99 |
| 76 | 270880 | American Truck Simulator | 2016-02-02 | 2000000 - 5000000 | 0 | 19.99 |
| 315 | 24010 | Train Simulator Classic | 2022-04-21 | 1000000 - 2000000 | 0 | 24.99 |
| 405 | 744060 | Groove Coaster | 2018-07-16 | 100000 - 200000 | 0 | 19.99 |
| 480 | 678950 | DRAGON BALL FighterZ | 2018-01-26 | 2000000 - 5000000 | 0 | 59.99 |
| 512 | 45760 | Ultra Street FighterR IV | 2014-08-07 | 500000 - 1000000 | 0 | 29.99 |
games_df.filtered <- games_df.filtered %>%
mutate(dlcCount = ifelse(dlcCount > 30, 30, dlcCount))
hist(games_df.filtered$dlcCount, breaks = 100, main = "Histogram of DLC Count")
As the distribution is heavily skewed, I will log-transform the
dlcCount feature as a new feature for interpretation.
games_df.filtered <- games_df.filtered %>% mutate(logDlcCount = log(1 + dlcCount))
hist(games_df.filtered$logDlcCount, main = "Histogram of log DLC count", xlab="log DLC count", breaks = 30)
The log transformed values for dlcCount have a
exponential distribution.
boxplot(logRecommendations ~ dlcCount, data = games_df.filtered)
It is shown that games with more dlc count leads to higher recommendations, there is more fluctuations in recommendations count for games with over 13 DLCs as there is not much data for this type of games, but their recommendation count remains on the relatively high range on average regardless.
skim(games_df.filtered)
| Name | games_df.filtered |
| Number of rows | 5596 |
| Number of columns | 43 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| Date | 1 |
| factor | 2 |
| logical | 3 |
| numeric | 30 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| appID | 0 | 1 | 3 | 7 | 0 | 5596 | 0 |
| name | 0 | 1 | 1 | 81 | 0 | 5576 | 0 |
| languages | 0 | 1 | 7 | 330 | 0 | 2525 | 0 |
| fullAudioLanguages | 0 | 1 | 0 | 319 | 2352 | 659 | 0 |
| developers | 0 | 1 | 2 | 142 | 0 | 3612 | 0 |
| publishers | 0 | 1 | 0 | 80 | 28 | 2446 | 0 |
| genres | 0 | 1 | 0 | 93 | 8 | 402 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| releaseDate | 0 | 1 | 2009-01-07 | 2024-05-16 | 2017-02-02 | 2496 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| estimatedOwners | 0 | 1 | FALSE | 9 | 200: 1329, 100: 1223, 500: 1108, 200: 742 |
| priceRange | 0 | 1 | FALSE | 3 | Low: 2699, Med: 1823, Hig: 1074 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| age16 | 0 | 1 | 0.08 | FAL: 5149, TRU: 447 |
| selfPublished | 0 | 1 | 0.44 | FAL: 3121, TRU: 2475 |
| industryLeader | 0 | 1 | 0.18 | FAL: 4604, TRU: 992 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| required_age | 0 | 1 | 1.62 | 4.93 | 0.00 | 0.00 | 0.00 | 0.00 | 18.00 | ▇▁▁▁▁ |
| price | 0 | 1 | 15.27 | 12.68 | 0.35 | 4.99 | 11.99 | 19.99 | 69.99 | ▇▅▂▁▁ |
| dlcCount | 0 | 1 | 1.75 | 4.30 | 0.00 | 0.00 | 0.00 | 1.00 | 30.00 | ▇▁▁▁▁ |
| positive | 0 | 1 | 5769.60 | 13183.27 | 44.00 | 545.00 | 1390.00 | 4299.50 | 119516.00 | ▇▁▁▁▁ |
| negative | 0 | 1 | 923.26 | 2308.34 | 3.00 | 109.00 | 259.00 | 704.00 | 41287.00 | ▇▁▁▁▁ |
| achievements | 0 | 1 | 50.71 | 243.69 | 0.00 | 10.00 | 25.00 | 46.00 | 5000.00 | ▇▁▁▁▁ |
| recommendations | 0 | 1 | 5278.38 | 11765.46 | 101.00 | 497.75 | 1355.50 | 4038.00 | 99084.00 | ▇▁▁▁▁ |
| averagePlaytime | 0 | 1 | 674.97 | 1481.90 | 2.00 | 165.00 | 291.00 | 651.00 | 51388.00 | ▇▁▁▁▁ |
| medianPlaytime | 0 | 1 | 587.67 | 1982.25 | 3.00 | 168.00 | 278.00 | 553.25 | 102435.00 | ▇▁▁▁▁ |
| developerCount | 0 | 1 | 1.17 | 0.51 | 1.00 | 1.00 | 1.00 | 1.00 | 10.00 | ▇▁▁▁▁ |
| publisherCount | 0 | 1 | 1.13 | 0.41 | 0.00 | 1.00 | 1.00 | 1.00 | 5.00 | ▇▁▁▁▁ |
| genresCount | 0 | 1 | 2.66 | 1.28 | 0.00 | 2.00 | 2.50 | 3.00 | 9.00 | ▂▇▂▁▁ |
| languagesCount | 0 | 1 | 6.57 | 5.51 | 1.00 | 1.00 | 6.00 | 10.00 | 29.00 | ▇▅▁▁▁ |
| fullAudioLanguagesCount | 0 | 1 | 1.96 | 3.63 | 0.00 | 0.00 | 1.00 | 2.00 | 29.00 | ▇▁▁▁▁ |
| estimatedSkew | 0 | 1 | -0.05 | 0.43 | -1.00 | -0.33 | -0.09 | 0.25 | 0.99 | ▂▆▇▅▂ |
| playtimeRatio | 0 | 1 | 1.46 | 3.36 | 0.50 | 0.75 | 0.92 | 1.34 | 146.20 | ▇▁▁▁▁ |
| averagePlayHours | 0 | 1 | 11.25 | 24.70 | 0.03 | 2.75 | 4.85 | 10.85 | 856.47 | ▇▁▁▁▁ |
| medianPlayHours | 0 | 1 | 9.79 | 33.04 | 0.05 | 2.80 | 4.63 | 9.22 | 1707.25 | ▇▁▁▁▁ |
| averagePlayDays | 0 | 1 | 0.47 | 1.03 | 0.00 | 0.11 | 0.20 | 0.45 | 35.69 | ▇▁▁▁▁ |
| medianPlayDays | 0 | 1 | 0.41 | 1.38 | 0.00 | 0.12 | 0.19 | 0.38 | 71.14 | ▇▁▁▁▁ |
| logMedianPlaytime | 0 | 1 | 5.71 | 1.06 | 1.39 | 5.13 | 5.63 | 6.32 | 11.54 | ▁▆▇▁▁ |
| aboveTwoHours | 0 | 1 | 0.85 | 0.36 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| logRecommendations | 0 | 1 | 7.33 | 1.51 | 4.62 | 6.21 | 7.21 | 8.30 | 11.50 | ▅▇▆▃▁ |
| releaseMonth | 0 | 1 | 6.65 | 3.34 | 1.00 | 4.00 | 7.00 | 10.00 | 12.00 | ▇▆▅▆▇ |
| releaseYear | 0 | 1 | 2016.65 | 3.33 | 2009.00 | 2014.00 | 2017.00 | 2019.00 | 2024.00 | ▃▇▇▆▂ |
| yearsReleased | 0 | 1 | 7.35 | 3.33 | 0.00 | 5.00 | 7.00 | 10.00 | 15.00 | ▃▆▇▅▂ |
| daysReleased | 0 | 1 | 2771.54 | 1213.54 | 138.00 | 1813.00 | 2798.00 | 3609.00 | 5746.00 | ▃▆▇▅▂ |
| positivePct | 0 | 1 | 0.81 | 0.13 | 0.15 | 0.75 | 0.84 | 0.91 | 0.99 | ▁▁▁▅▇ |
| logAchievements | 0 | 1 | 2.79 | 1.60 | 0.00 | 2.40 | 3.26 | 3.85 | 8.52 | ▅▆▇▁▁ |
| logDlcCount | 0 | 1 | 0.57 | 0.77 | 0.00 | 0.00 | 0.00 | 0.69 | 3.43 | ▇▅▂▁▁ |
# Variables to include in the correlation analysis: targets, candidate
# predictors and engineered indicator columns.
corr_vars <- c(
  "logRecommendations", "recommendations", "price", "estimatedSkew",
  "positivePct", "dlcCount", "medianPlaytime", "logMedianPlaytime",
  "daysReleased", "languagesCount", "aboveTwoHours", "selfPublished",
  "age16", "industryLeader"
)
numeric_df <- games_df.filtered[, corr_vars]
# Pairwise Pearson correlations between the selected variables
corr_matrix <- cor(numeric_df)
# Upper-triangle heatmap with the coefficient printed in each cell
corrplot(corr_matrix, method = "color", type = "upper",
         addCoef.col = "black",                       # coefficient text colour
         tl.col = "black", tl.srt = 45, tl.cex = 0.8, # axis label styling
         number.cex = 0.55)                           # coefficient text size
Predictors Selection for recommendations as the
regression target variable:
It could be seen that estimatedSkew, price,
positivePct, languagesCount,
dlcCount, age16 and
industryLeader have a correlation above 0.2 with the log
target variable. In addition, the log-transformed medianPlaytime has
a higher correlation with the log target variable than its
original value, also above 0.2.
chosen features: estimatedSkew, price,
positivePct, languagesCount,
age16, industryLeader, dlcCount,
logMedianPlaytime,
Predictors Selection for priceRange as the
classification target variable:
dlcCount, daysReleased,
languagesCount, age16,
industryLeader, selfPublished and
aboveTwoHours have relatively strong correlation with the
target, also the log transformed medianPlaytime and
recommendations have a much higher correlation than its
original value. Variables with weaker correlations are included for this
problem because the models chosen are more flexible and robust to
linearity assumptions.
chosen features: dlcCount, daysReleased,
languagesCount, age16,
industryLeader, selfPublished,
aboveTwoHours, logMedianPlaytime,
logRecommendations
Scatter plot for regression features
# Pairwise scatter/density matrix for the regression target and the
# chosen predictors (GGally::ggpairs).
regr_pair_vars <- c("logRecommendations", "estimatedSkew", "price",
                    "positivePct", "languagesCount", "logMedianPlaytime",
                    "dlcCount", "age16", "industryLeader")
ggpairs(games_df.filtered[, regr_pair_vars],
        lower = list(continuous = wrap("points", size = 0.5)),
        progress = FALSE) +
  theme(
    # horizontal y-strip labels are easier to read on the right margin
    strip.text.y = element_text(angle = 0, size = 8.5),
    strip.text.x = element_text(size = 8.5)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For the Unsupervised learning part of this coursework, I will be using the user-generated tags frequency for each game to find clusters of tags that are similar.
Hierarchical clustering can be used to find common groups within different tags. The euclidean distance between tags based on their usage and frequency across different games can be used to find tags that are “close” to each other.
# Reshape the long tag table to wide format: one row per tag, one column
# per game (appID), cell value = how often users applied that tag.
tags_games <- top_tags %>%
  pivot_wider(id_cols = tag,           # rows
              names_from = appID,      # columns
              values_from = frequency) # cell values
# A missing (tag, game) combination means the tag was never applied -> 0
tags_games[is.na(tags_games)] <- 0
# Numeric appIDs get an "X" prefix etc. so columns are valid R names
colnames(tags_games) <- make.names(colnames(tags_games))
head(tags_games[1:6])
| tag | X219990 | X392110 | X370910 | X231430 | X252130 |
|---|---|---|---|---|---|
| Action.RPG | 34590 | 0 | 0 | 0 | 0 |
| Strategy | 0 | 30531 | 0 | 8346 | 58 |
| Point…Click | 0 | 0 | 13760 | 0 | 0 |
| Puzzle | 0 | 0 | 54 | 0 | 7706 |
| Hack.and.Slash | 7225 | 0 | 0 | 0 | 0 |
| Dark.Fantasy | 7212 | 0 | 0 | 0 | 0 |
Format as matrix form and get distance matrix
# Keep the tag labels aside; the clustering itself uses only the numbers
tags_games.labs <- tags_games$tag
tags_games.data <- tags_games[,-1]# exclude first col (the `tag` column)
# scale the frequency columns (mean 0, sd 1 per game)
tags_games.scaled <- scale(tags_games.data)
# calculate Euclidean distances between tags in the scaled space
tags_games.dist <- dist(tags_games.scaled)
Hierarchical clustering using distance matrix
# Complete-linkage (hclust default) dendrogram on Euclidean distances
plot(hclust(tags_games.dist),
labels = tags_games.labs,
main = "Euclidean-distance Clustering",
xlab = "",
ylab = "",
cex = 0.2)
# Same distance matrix with average linkage, for comparison
plot(hclust(tags_games.dist, method = "average"),
labels = tags_games.labs,
main = "Average Linkage",
xlab = "",
ylab = "",
cex = 0.3)
Look at cluster assignments when K = 10.
# Cut the complete-linkage Euclidean tree into 10 clusters and tabulate
# how many tags land in each one (reuses the distance matrix computed above)
tags_games.dist_hclust <- hclust(tags_games.dist)
dist_hclust.clusters <- cutree(tags_games.dist_hclust, k = 10)
data.frame(table(dist_hclust.clusters))
| dist_hclust.clusters | Freq |
|---|---|
| 1 | 421 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| 5 | 1 |
| 6 | 1 |
| 7 | 1 |
| 8 | 1 |
| 9 | 4 |
| 10 | 1 |
Look at some tags in cluster 1 and 9
# Tag -> cluster lookup table for the Euclidean-distance clustering
dist_clusters <- data.frame(tag = tags_games.labs, cluster = dist_hclust.clusters)
# First 10 tags assigned to cluster 1.
# The old `dist_clusters[dist_clusters['cluster'] == 1]` indexed the data
# frame with a logical *matrix*, which returns matching values column-major
# and leaks cluster numbers into the printed list; subset the tag column.
head(dist_clusters$tag[dist_clusters$cluster == 1], 10)
## [1] "Action.RPG" "Point...Click" "Hack.and.Slash"
## [4] "Dark.Fantasy" "Metroidvania" "Early.Access"
## [7] "FPS" "Real.Time.Tactics" "Space.Sim"
## [10] "X4X"
head(dist_clusters[dist_clusters['cluster'] == 9],10)
## [1] "Multiplayer" "Singleplayer" "Co.op" "Online.Co.Op" " 9"
## [6] " 9" " 9" " 9"
When the tags are separated into 10 clusters, it could be seen that most of the tags are grouped in cluster 1, while cluster 9, for example, contains popular tags like “Multiplayer”, “Online Co-op” and “Co-op”. This could be due to the fact that the euclidean distance takes into account the raw frequency of the tags voted for each game; since each game only consists of 10-20 tags, the distribution of tag frequency for each game would be zero-inflated. This zero-inflation can lead to many tags having low or no frequencies across games, which is most likely why they have been grouped together. Moreover, since user-generated tag frequencies are heavily influenced by game popularity, a popular game will naturally have higher frequencies of tags; this introduces tag outliers, where tags tied to very popular games end up grouped by themselves because they have a longer euclidean distance to any other tags.
I will then try a correlation-based distance to measure dissimilarity between tags, which focuses on relationships between tags rather than their raw frequency/magnitude.
# Correlation between tags across games: tags are the rows of
# tags_games.scaled, so transpose before cor() to correlate tag profiles.
tags_games.corr <- cor(t(tags_games.scaled))
# 1 - correlation to convert to a correlation-based dissimilarity matrix
tags_games.corr_dist <- as.dist(1 - tags_games.corr)
# Fit each linkage once and reuse the fitted trees: the original code
# recomputed hclust() for every plot and once more for rect.hclust(),
# repeating the same O(n^2 log n) clustering four times.
corr_hc_complete <- hclust(tags_games.corr_dist)
corr_hc_average <- hclust(tags_games.corr_dist, method = "average")
plot(corr_hc_complete,
     labels = tags_games.labs,
     main = "Complete Linkage",
     xlab = "",
     ylab = "",
     cex = 0.3)
plot(corr_hc_average,
     labels = tags_games.labs,
     main = "Average Linkage",
     xlab = "",
     ylab = "",
     cex = 0.3)
plot(corr_hc_complete,
     labels = tags_games.labs,
     main = "Correlation-based distance Clustering",
     xlab = "",
     ylab = "",
     cex = 0.2)
# Outline the 7 clusters on the complete-linkage dendrogram just drawn
rect.hclust(corr_hc_complete, k = 7)
Look at cluster assignments frequency when K = 7.
# Complete-linkage clustering on the correlation-based distances,
# cut into 7 clusters.
corr_hc <- hclust(tags_games.corr_dist)
corr_hc.clusters <- cutree(corr_hc, k = 7)
# Tag -> cluster lookup, ordered by cluster id for easier inspection
tag_cluster <- data.frame(tag = tags_games.labs,
                          cluster = corr_hc.clusters) %>%
  arrange(cluster)
# Cluster sizes
data.frame(table(corr_hc.clusters))
| corr_hc.clusters | Freq |
|---|---|
| 1 | 86 |
| 2 | 39 |
| 3 | 59 |
| 4 | 173 |
| 5 | 19 |
| 6 | 30 |
| 7 | 27 |
Look at tag assignments from correlation matrix of raw tag frequencies
# Print the first 20 tags of every cluster (the old comment said 10, but
# head(..., 20) was used). seq_along() replaces 1:length(); subsetting the
# tag column replaces `tag_cluster[tag_cluster['cluster']==i]`, which
# indexed the data frame with a logical matrix -- that returns matching
# values column-major and let cluster ids (e.g. the "5" in cluster 5's
# output) leak into the printed tag lists.
for (i in seq_along(unique(tag_cluster$cluster))) {
  cat('\ncluster', i, ': [',
      head(tag_cluster$tag[tag_cluster$cluster == i], 20),
      ',...]')
}
##
## cluster 1 : [ Action.RPG Hack.and.Slash Dark.Fantasy X4X PvE Loot Medieval Hacking Bullet.Hell Dungeon.Crawler Rogue.lite Souls.like Rogue.like Immersive.Sim Replay.Value Action.Roguelike X3D Dog Deckbuilding Card.Battler ,...]
## cluster 2 : [ Strategy Real.Time.Tactics RTS Grand.Strategy Wargame Real.Time Racing City.Builder Simulation Colony.Sim Life.Sim Sandbox Physics Realistic Automobile.Sim VR Building Historical Driving Transportation ,...]
## cluster 3 : [ Point...Click Adventure Metroidvania Horror Great.Soundtrack Story.Rich Walking.Simulator Pixel.Graphics Choices.Matter Difficult Singleplayer X2D Multiple.Endings Drama Cinematic Roguevania Platformer Dark Relaxing Exploration ,...]
## cluster 4 : [ Puzzle MMORPG Indie Casual Artificial.Intelligence Sailing Sports World.War.I Tanks Surreal Psychedelic Jet Steampunk Parkour Logic Short Beautiful Puzzle.Platformer Comic.Book World.War.II ,...]
## cluster 5 : [ Early.Access FPS Space.Sim PvP Open.World Survival Open.World.Survival.Craft Multiplayer Space First.Person Team.Based Co.op Shooter Sci.fi Science Online.Co.Op Crafting Voxel Mars 5 ,...]
## cluster 6 : [ RPG Sexual.Content Nudity Hentai Mature Turn.Based.Tactics Magic LGBTQ. Romance Female.Protagonist Fantasy Cute Turn.Based.Combat JRPG Character.Customization Turn.Based.Strategy Anime Turn.Based NSFW Strategy.RPG ,...]
## cluster 7 : [ Comedy Action Funny Zombies Stealth Gore Arena.Shooter Combat Assassin Violent Post.apocalyptic Dark.Humor Action.Adventure Third.Person Looter.Shooter Third.Person.Shooter Blood Fast.Paced Inventory.Management Satire ,...]
Based on the dendrogram of clustering using complete linkage method, it could be seen that there are 7 major clusters based on a correlation-based distance between tags. It is also shown that each of the clusters have tags that are more similar to each other compared to euclidean-based distance. For example, strategic shooting games-related tags are all in the same cluster, and singleplayer or roleplaying games-related tags are in another cluster, and so on.
The results shows that using correlation-based distances can help mitigate some issues associated with zero-inflation, as it focuses on the relative patterns of usage rather than absolute counts.
There is one cluster that contains a lot more tags than any other clusters, this cluster is formed mostly by merging a lot of other smaller clusters during the clustering process, and have formed a single “branch” of its own, with many inner clusters merged sequentially, this could indicate that the tags within this cluster are heterogeneous, and that it contains a diverse mix of tags with weak or no meaningful relationships between them. Within this cluster, some commonly used tags like “MMORPG” and “Casual” are present, while the remaining tags within the cluster seems to be too specific or unrelated to each other for the algorithm to group them meaningfully, which could be the reason why they are all grouped together in this one cluster.
Based on previous results in hierarchical clustering, it has been shown that correlation-based dissimilarity distances are useful for grouping tags into clusters. To further improve the quality of clusters, I will try reducing dimensionality of the tags data before performing clustering to minimize noise and redundant information. In this context, Principal Component Analysis (PCA) can be utilized to identify linear combinations of different tags that maximize the variance in the data.
PCA examines the relationships between tags based on their frequency across all games. Tags that frequently appear together tend to have strong loadings on the same principal components, indicating that they contribute similarly to the overall variance. Moreover, hierarchical clustering can then be applied to the principal component scores, using the distances between these projected loading scores to derive more refined clusters.
# Wide format with games as rows and tags as columns (the transpose of
# the earlier tags_games layout).
tags.wide <- top_tags %>%
  pivot_wider(id_cols = appID,         # rows
              names_from = tag,        # columns
              values_from = frequency) # cell values
# Absent tag/game combinations mean zero votes
tags.wide[is.na(tags.wide)] <- 0
# Sanitise tag names into syntactically valid R identifiers
colnames(tags.wide) <- make.names(colnames(tags.wide))
head(tags.wide[1:6])
| appID | Action.RPG | Strategy | Point…Click | Puzzle | Hack.and.Slash |
|---|---|---|---|---|---|
| 219990 | 34590 | 0 | 0 | 0 | 7225 |
| 392110 | 0 | 30531 | 0 | 0 | 0 |
| 370910 | 0 | 0 | 13760 | 54 | 0 |
| 231430 | 0 | 8346 | 0 | 0 | 0 |
| 252130 | 0 | 58 | 0 | 7706 | 0 |
| 285800 | 0 | 7582 | 0 | 0 | 0 |
dim(tags.wide)
## [1] 5596 434
Perform PCA
# Standardise each tag column (mean 0, sd 1); drop the appID column first
tags_scaled <- scale(tags.wide[,-1])
# PCA on the transposed matrix, so tags are the observations and the
# principal components summarise variation across games
pca_result <- prcomp(t(tags_scaled))
Choose number of components
explained_variance <- summary(pca_result)$importance[2, ] # Proportion of variance explained
cumulative_variance <- cumsum(explained_variance) # Cumulative variance explained
num_components <- which(cumulative_variance >= 0.9)[1] # Retain components to explain 90% of variance
# Scree-style plot of cumulative PVE against component index
plot(cumulative_variance,
xlab = 'Principal Components',
ylab = "Cumulative PVE",
main = "PCA Screeplot")
# vertical line marking the 90%-PVE cut-off at num_components
abline(v = num_components, col = "red", lwd = 2, lty = 2)
# text next to the vertical line
text(num_components + 0.5, 0.85,
labels = paste("90% PVE at PC", num_components),
col = "red", pos = 4)
Project tags data onto PCA space
# Tag coordinates in PC space (one row per tag)
pca_scores <- as.data.frame(pca_result$x)
# Keep only the leading components needed for 90% cumulative PVE;
# seq_len() is safe even when num_components could be small
pca_scores_subset <- pca_scores[, seq_len(num_components)]
head(pca_scores_subset[1:6])
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | |
|---|---|---|---|---|---|---|
| Action.RPG | -10.802061 | 8.575841 | -16.570341 | 1.199756 | -18.2563128 | 18.818229 |
| Strategy | 18.571310 | -1.686384 | -5.990351 | -5.520766 | -9.2062502 | -4.053358 |
| Point…Click | -3.956126 | 1.630471 | 5.245800 | -3.629615 | 0.9030723 | 6.578221 |
| Puzzle | -12.648316 | -11.923013 | 11.670569 | -8.818684 | 0.0965081 | -9.752322 |
| Hack.and.Slash | -15.188342 | 12.078385 | -21.529573 | 2.414603 | -21.9329717 | 10.655177 |
| Dark.Fantasy | -12.971287 | 10.946296 | -19.037150 | 1.014465 | -25.2052921 | 16.420057 |
Calculate Correlation-based distance matrix of tags based on PCs
# Correlate tag score profiles (tags are rows, hence the transpose)
pca_cor_matrix <- cor(t(pca_scores_subset))
# Convert the correlation matrix to a dissimilarity distance matrix
pca_cor_distance <- as.dist(1 - pca_cor_matrix)
# perform complete-linkage hierarchical clustering on the PC-based distances
pca_hclust <- hclust(pca_cor_distance, method = "complete")
Examining cluster assignments
Plotting dendrogram with colored branches and group number, used modified code from: https://stackoverflow.com/questions/48636522/label-r-dendrogram-branches-with-correct-group-number
# Cluster assignment (data order) for the 7-cluster cut of the PC-based tree
pca_clusters <- cutree(pca_hclust, k = 7)
# Create dendrogram
dend <- as.dendrogram(pca_hclust)
colors = RColorBrewer::brewer.pal(7, "Dark2")
# below code is from: https://stackoverflow.com/questions/57674446/how-to-associate-cluster-labels-and-dendrogram-in-the-same-order-on-a-plot?noredirect=1&lq=1
# dendextend's internal cutree can return clusters in dendrogram (drawing)
# order rather than data order; both orderings are needed so branch labels
# match the cluster ids produced by cutree() above.
dend.cutree <- dendextend:::cutree(pca_hclust, k=7, order_clusters_as_data = FALSE)
idx <- order(names(dend.cutree))
dend.cutree <- dend.cutree[idx]
# match data-order ids (x) to dendrogram-order ids (y) via tag row names
df.merge <- merge(pca_clusters,dend.cutree,by='row.names')
df.merge.sorted <- df.merge[order(df.merge$y),]
# branch labels in the order the dendrogram draws them, left to right
lbls<-unique(df.merge.sorted$x)
dend1 <- color_branches(pca_hclust, k = 7, groupLabels = lbls, col = colors[lbls])
plot(dend1, leaflab = 'none', main = "Tag Clustering w/ PC score correlation-distance")
Around 7 major clusters could be determined after cutting the
dendrogram.
# Cluster sizes for the 7-cluster cut of the PC-based tree
pca_clusters <- cutree(pca_hclust, k = 7)
print(data.frame(table(pca_clusters)))
## pca_clusters Freq
## 1 1 17
## 2 2 23
## 3 3 231
## 4 4 19
## 5 5 80
## 6 6 34
## 7 7 29
PC variance and PCs plot with cluster assignments
# calculate variance explained per component from the PC standard deviations
pca.var <- pca_result$sdev^2
pca.var.per <- round(pca.var/sum(pca.var), 3)
# First three PC scores for each tag, for the 2D and 3D plots below
pca.data <- data.frame(tag = rownames(pca_result$x),
X = pca_result$x[,1],
Y = pca_result$x[,2],
Z = pca_result$x[,3])
# cluster assignment (7-cluster cut) as a factor, used for colouring
pca.data$cluster <- factor(pca_clusters)
# Tags plotted as text in PC1/PC2 space, coloured by cluster
ggplot(data = pca.data,
aes(x = X, y = Y, label = tag, color = cluster)) +
geom_text(size = 3) +
xlab(paste("PC1, PVE: ", pca.var.per[1], sep = "")) +
ylab(paste("PC2, PVE: ", pca.var.per[2], sep = "")) +
ggtitle("PCA plot of first two principal components") +
scale_color_brewer(palette = "Dark2")
3D plot for first 3 PCs
# Open a new 3D plotting window and create an empty plot (type = "n");
# points are added as text labels below (rgl interactive device)
plot3d(pca.data$X, pca.data$Y, pca.data$Z,
type = "n", # Do not plot points immediately
xlab = paste("PC1, PVE: ", pca.var.per[1], sep = ""),
ylab = paste("PC2, PVE: ", pca.var.per[2], sep = ""),
zlab = paste("PC3, PVE: ", round(pca.var.per[3], 3), sep = ""))
# Add text labels at each point's coordinates.
# Here, 'texts' are taken from the 'tag' column and colored according to the cluster.
text3d(pca.data$X, pca.data$Y, pca.data$Z,
texts = pca.data$tag,
col = pca.data$cluster,
cex = 0.7) # Adjust 'cex' to control text size
Tag clustering from correlation matrix of pc scores of each tag (first 10 tags of each cluster).
# Tag -> cluster lookup for the PCA-based clustering
tag_pc_cluster <- data.frame(tag = names(pca_clusters),
                             cluster = pca_clusters)
# Print the first 10 tags in each cluster. seq_along() replaces
# 1:length(); subsetting the tag column directly replaces the old
# logical-matrix indexing, which returned values column-major and could
# leak cluster ids into the printed lists.
for (i in seq_along(unique(tag_pc_cluster$cluster))) {
  cat('\ncluster', i, ': [',
      head(tag_pc_cluster$tag[tag_pc_cluster$cluster == i], 10),
      ',...]')
}
##
## cluster 1 : [ Action.RPG Hack.and.Slash Dark.Fantasy RPG Loot Magic Dungeon.Crawler Fantasy Dark Character.Customization ,...]
## cluster 2 : [ Strategy Real.Time.Tactics Space.Sim X4X RTS Grand.Strategy Wargame PvP PvE Real.Time ,...]
## cluster 3 : [ Point...Click Puzzle Metroidvania Racing Medieval Great.Soundtrack Bullet.Hell Female.Protagonist Platformer Zombies ,...]
## cluster 4 : [ Adventure FPS Multiplayer Story.Rich Singleplayer First.Person Action Co.op Dog Atmospheric ,...]
## cluster 5 : [ Early.Access Sexual.Content MMORPG Nudity Hentai Mature City.Builder Turn.Based.Tactics Walking.Simulator Hacking ,...]
## cluster 6 : [ Horror Indie Rogue.lite Souls.like Pixel.Graphics Rogue.like Difficult X2D Replay.Value Action.Roguelike ,...]
## cluster 7 : [ Casual Life.Sim LGBTQ. Choices.Matter Comedy Immersive.Sim Multiple.Endings Funny Romance Drama ,...]
Overall, the resulting cluster assignments only partially matched those from clustering the raw tag frequencies directly, but due to concerns with zero-inflation, which might distort principal components, and the fact that there is not much reduction in data dimensionality (from 433 to 252 features), I have chosen to use the clustering results from the original correlation matrix of raw tag frequencies.
For the regression task, I will use the Negative Binomial Regression
model to predict recommendations, since this is a count
variable with variance a lot higher than the mean, a Negative Binomial
Regression model is suitable because it is specifically designed to
handle overdispersion of a count variable.
The initial selected predictors for the models will
be:estimatedSkew, price,
positivePct, languagesCount,
age16, industryLeader,
logMedianPlaytime, dlcCount,
clusterCount, and the cluster weights. Since there are a
total of 17 features, a subset selection process will be implemented to
determine a smaller subset of features or coefficients that best
optimizes the predictive performance of the model and to simplify the
model.
The data is split into (8:2) training and testing partitions: - The training partition would be used to estimate a generalized test error for different models. - The testing partition would be used to evaluate the final test error of the selected model.
Three models would be considered for regression task, namely, the
Negative Binomial Regression, random forest,
and xgboost. The Negative binomial regression model assumes
linear relationships between the log-transformed target variable and the
independent features, which is why I have chosen features that are most
correlated with the log-transformed target variable,
medianPlaytime is also log-transformed because it was
shown to produce a stronger correlation with the log-transformed target.
# features used for both GLM and tree-based models: target, join key,
# selected predictors, plus the tag-cluster weights joined in below
regr.features <- left_join(games_df.filtered[,c('recommendations','appID','estimatedSkew', 'price', 'positivePct', 'languagesCount', 'age16', 'industryLeader', 'logMedianPlaytime', 'dlcCount')],
clusterWeight.wide,
by = 'appID')
# add per-game cluster counts, then drop the join key (not a predictor)
regr.features <- left_join(regr.features, cluster.count, by = 'appID') %>% dplyr::select(-appID)
# define the mlr3 regression task holding all candidate features
regr_task = as_task_regr(regr.features,
target = "recommendations",
id = "regr task all")
# mark "age16" and "industryLeader" as strata so the train/test split
# preserves their class proportions (they remain features as well)
regr_task$set_col_roles("age16", c("feature","stratum"))
regr_task$set_col_roles("industryLeader", c("feature","stratum"))
# Set seed for reproducibility of the partition below
set.seed(1)
# split into stratified 80% train / 20% test partitions
regr_splits <- partition(regr_task, ratio = 0.8)
print(regr_task)
print(regr_task)
## <TaskRegr:regr task all> (5596 x 17)
## * Target: recommendations
## * Properties: strata
## * Features (16):
## - dbl (14): X1, X2, X3, X4, X5, X6, X7, clusterCount, dlcCount,
## estimatedSkew, languagesCount, logMedianPlaytime, positivePct,
## price
## - lgl (2): age16, industryLeader
## * Strata: age16, industryLeader
This model would be used as a baseline for comparison of other models.
# Featureless baseline: predicts the training-set mean for every row,
# giving a floor that real models must beat.
regr_featureless <- lrn("regr.featureless")
regr_featureless$train(regr_task, regr_splits$train)
# Score the baseline on the held-out test rows
prediction_bas <- regr_featureless$predict(regr_task, regr_splits$test)
prediction_bas$score(msrs(c("regr.mse","regr.rmse","regr.rsq","regr.mae")))
## regr.mse regr.rmse rsq regr.mae
## 1.139723e+08 1.067578e+04 -3.297003e-03 5.964871e+03
tuning and training XGBoost model with training data.
# Final XGBoost learner for the regression task
xgboost_lrn_tuned <- lrn("regr.xgboost")
# copy the best hyper-parameters found during tuning (xgboost_tune_instance)
xgboost_lrn_tuned$param_set$values = xgboost_tune_instance$result_learner_param_vals
# refit on the full training partition
xgboost_lrn_tuned$train(regr_task.train)
Feature importance based on contribution of each feature to the model’s objective function.
# Feature importance scores from the tuned XGBoost learner
importance_scores <- xgboost_lrn_tuned$importance()
# Build a data frame and order it with the most important feature first
xgb_regr_importance <- data.frame(
  feature = names(importance_scores),
  importance = unname(importance_scores)
) %>%
  arrange(desc(importance))
print(xgb_regr_importance)
## feature importance
## 1 estimatedSkew 0.351427986
## 2 logMedianPlaytime 0.176837812
## 3 positivePct 0.115743044
## 4 languagesCount 0.077117969
## 5 X5 0.062503218
## 6 X3 0.051692464
## 7 dlcCount 0.026793850
## 8 price 0.024226841
## 9 X7 0.020418356
## 10 X4 0.019165813
## 11 X6 0.017165894
## 12 X2 0.017083508
## 13 X1 0.016268271
## 14 clusterCount 0.009101299
## 15 age16 0.008550836
## 16 industryLeader 0.005902840
# Horizontal bar chart of importance scores, most important at the top
ggplot(xgb_regr_importance,
       aes(x = reorder(feature, importance), y = importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() + # flip so feature names read horizontally
  labs(title = "Feature Importance in XGBoost Model",
       x = "Feature",
       y = "Importance Score") +
  theme_minimal()
# Fit a single regression tree on a reduced feature set for interpretability
regr_rpart_lrn <- lrn("regr.rpart")
regr_rpart_subset <- regr_task$clone()
# keep only the top 5 most important features; note these importances come
# from the tuned XGBoost model (the earlier "rforest" comment was wrong)
regr_rpart_subset$select(names(head(xgboost_lrn_tuned$importance(),5)))
regr_rpart_lrn$train(regr_rpart_subset, row_ids = regr_splits$train)
regr_rpart_pred <- regr_rpart_lrn$predict(regr_rpart_subset,row_ids = regr_splits$test)
# visualise the fitted tree
rpart.plot(regr_rpart_lrn$model)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
### Test results
# Evaluate the tuned XGBoost model on the held-out test partition
xgboost_prediction <- xgboost_lrn_tuned$predict(regr_task, row_ids = regr_splits$test)
xgboost_prediction$score(msrs(c("regr.mse","regr.rmse","regr.rsq","regr.mae")))
## regr.mse regr.rmse rsq regr.mae
## 6.692919e+07 8.181027e+03 4.108229e-01 3.773073e+03
# Test-set residuals (observed - predicted)
xgboost_residuals <- xgboost_prediction$truth - xgboost_prediction$response
# Observed vs. predicted: points on the diagonal are perfect predictions
plot(xgboost_prediction$response, xgboost_prediction$truth,
main = "Actual vs. Predicted",
xlab = "Predicted Values",
ylab = "Actual Values",
pch =20,
cex = 0.8)
abline(0, 1) # Line of perfect prediction
hist(xgboost_residuals, main = "Histogram of Test Residuals",xlab = "Residuals", breaks = 40)
# Residual spread against fitted values (checks for systematic bias)
plot(x = xgboost_prediction$response, y = xgboost_residuals,
main = "Residuals vs. Predicted Values",
xlab = "Predicted Values",
ylab = "Residuals",
pch = 20,
cex = 0.8)
abline(h = 0)
Visualize prediction using important features
# Get the test data from the task
test_data <- regr_task$data(rows = regr_splits$test)
# get predictions as df
predictions_df <- data.frame(
truth = xgboost_prediction$truth,
response = xgboost_prediction$response
)
# Combine test data with predictions (rows share the same test-split order)
combined_data <- dplyr::bind_cols(test_data, predictions_df)
# Predicted response against the two most important predictors
ggplot(combined_data, aes(x = estimatedSkew, y = response)) +
geom_point()
ggplot(combined_data, aes(x = logMedianPlaytime, y = response)) +
geom_point()
Since the target recommendations is a count variable
that has variance a lot higher than the mean, a Negative Binomial would
be suitable for this problem. I will begin by first determining a subset
of features using a forward-stepwise selection algorithm, with AIC as
criterion of subset selection. The selected features would then be used
in the cross-validation process to estimate the performance measures of
the model with subset features.
# Forward stepwise selection for the Negative Binomial GLM on the training
# data, scored by AIC. The full (all predictors) and intercept-only null
# models define the search scope. `recommendations ~ .` is already a
# formula literal, so the as.formula() wrappers in the original were
# redundant and have been removed.
nb_regr.full <- glm.nb(recommendations ~ ., data = regr_task.train$data())
nb_regr.null <- glm.nb(recommendations ~ 1, data = regr_task.train$data())
fw_nb_regr <- step(nb_regr.null,
                   scope = list(lower = nb_regr.null, upper = nb_regr.full),
                   direction = "forward",
                   k = 2,          # penalty multiplier 2 = AIC criterion
                   trace = FALSE)  # suppress per-step output
# summary of the selected model
nb_summary <- summary(fw_nb_regr)
print(nb_summary)
##
## Call:
## glm.nb(formula = recommendations ~ estimatedSkew + logMedianPlaytime +
## languagesCount + positivePct + X5 + price + X4 + X3 + clusterCount +
## age16 + X1 + dlcCount + X2, data = regr_task.train$data(),
## init.theta = 1.290444021, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.788520 0.133944 20.818 < 2e-16 ***
## estimatedSkew 1.651871 0.033759 48.931 < 2e-16 ***
## logMedianPlaytime 0.270035 0.014182 19.041 < 2e-16 ***
## languagesCount 0.042923 0.002539 16.904 < 2e-16 ***
## positivePct 2.482547 0.104810 23.686 < 2e-16 ***
## X5 1.928823 0.124934 15.439 < 2e-16 ***
## price 0.014065 0.001274 11.043 < 2e-16 ***
## X4 -0.663502 0.098910 -6.708 1.97e-11 ***
## X3 0.809634 0.092694 8.734 < 2e-16 ***
## clusterCount 0.134362 0.012206 11.008 < 2e-16 ***
## age16TRUE 0.388600 0.052082 7.461 8.57e-14 ***
## X1 -0.468189 0.124969 -3.746 0.000179 ***
## dlcCount 0.015627 0.003354 4.660 3.17e-06 ***
## X2 0.215163 0.086243 2.495 0.012601 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.2904) family taken to be 1)
##
## Null deviance: 14393.0 on 4475 degrees of freedom
## Residual deviance: 5023.8 on 4462 degrees of freedom
## AIC: 78491
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.2904
## Std. Err.: 0.0246
##
## 2 x log-likelihood: -78460.6080
# Calculate and print AIC of the final model (the same criterion used by
# step() above with k = 2)
final_aic <- AIC(fw_nb_regr)
print(paste("Final AIC:", final_aic))
## [1] "Final AIC: 78490.6076292886"
The log link function is used for the Negative binomial distribution, where y = exp(intercept + B1X1 + … + BnXn); the dispersion parameter theta is estimated to be 1.29, which indicates that the assumption of overdispersion is fulfilled and the Negative Binomial is preferred over a Poisson GLM.
Almost all predictor variables are statistically significant (p-values < 0.001) based on their associated z-values and significance codes (***). This suggests that they have a meaningful relationship with the response variable.
Positive Coefficients: Variables such as estimatedSkew, logMedianPlaytime, languagesCount, positivePct, price, X5, X3, X2, clusterCount, age16, and dlcCount are associated with an increase in the expected count of recommendations. For example: a 1-unit increase in clusterCount is associated with a multiplicative increase of exp(0.134) ≈ 1.144, or around a 14.4% increase in the expected count of recommendations.
Negative Coefficients: The variables X1 and X4 have negative coefficients, indicating a decrease in the expected count.
Null Deviance: The deviance of the model with only the intercept is 14393.0.
Residual Deviance: The deviance of the fitted model with predictors is 5023.8, a large reduction from the null deviance, which indicates that the model explains a significant portion of the variability in the data.
Plotting the coefficients
plot_summs(fw_nb_regr)
Look at how different predictors are used to estimate
recommendations in the model (y-axis limited to below 10000)
# Partial-effect plot of estimatedSkew with 95% interval and jittered points
effect_plot(fw_nb_regr, pred = estimatedSkew, interval = TRUE, plot.points = TRUE,
jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_path()`).
# Partial-effect plot of logMedianPlaytime with 95% interval
effect_plot(fw_nb_regr, pred = logMedianPlaytime, interval = TRUE, plot.points = TRUE,
jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_path()`).
# Partial-effect plot of positivePct with 95% interval
effect_plot(fw_nb_regr, pred = positivePct, interval = TRUE, plot.points = TRUE,
jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Partial-effect plot of the cluster-weight feature X5 with 95% interval
effect_plot(fw_nb_regr, pred = X5, interval = TRUE, plot.points = TRUE,
jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_path()`).
# Partial-effect plot of the cluster-weight feature X4 with 95% interval
effect_plot(fw_nb_regr, pred = X4, interval = TRUE, plot.points = TRUE,
jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
### Selected features
# Predictor names kept by forward selection, as reported by glm.nb
# (note the dummy-coded "age16TRUE"; fixed up below before re-fitting)
nb_selected_features <- names(coef(fw_nb_regr))[-1] # Exclude the intercept
print(nb_selected_features)
## [1] "estimatedSkew" "logMedianPlaytime" "languagesCount"
## [4] "positivePct" "X5" "price"
## [7] "X4" "X3" "clusterCount"
## [10] "age16TRUE" "X1" "dlcCount"
## [13] "X2"
# Replace "age16TRUE" with "age16": glm.nb reports the dummy-coded level
# name for the logical feature, so map it back to the original column name
nb_selected_features <- gsub("age16TRUE", "age16", nb_selected_features)
# Prepare formula for the selected features
nb_selected_formula <- as.formula(paste("recommendations ~", paste(nb_selected_features, collapse = " + ")))
# Refit the NB model on the training data with only the selected features
nb_regr.final <- glm.nb(nb_selected_formula, data = regr_task.train$data())
Testing predicted results against observed values on new data.
# Subset the test data task object to include only the selected features
regr_task.test_nb <- regr_task$clone()
regr_task.test_nb$filter(rows = regr_splits$test)
regr_task.test_nb$select(nb_selected_features)
# Predict expected counts (type = "response" applies the inverse log link)
nb_predicted.test <- predict(nb_regr.final, newdata = regr_task.test_nb$data(), type = "response")
nb_actual.test <- regr_task.test_nb$data()$recommendations
# Held-out test error metrics
test_rmse <- sqrt(mean((nb_actual.test - nb_predicted.test)^2))
test_mae <- mean(abs(nb_actual.test - nb_predicted.test))
cat("Test RMSE:", test_rmse, "\n",
"Test MAE:", test_mae)
## Test RMSE: 8437.562
## Test MAE: 3621.496
# Observed vs. predicted scatter plot; points on the diagonal are perfectly
# predicted.
plot(nb_predicted.test, nb_actual.test,
main = "Actual vs. Predicted",
xlab = "Predicted Values",
ylab = "Actual Values",
pch =20,
cex = 0.8)
abline(0, 1) # Line of perfect prediction
# Raw residuals, plus Pearson residuals standardized by the
# negative-binomial variance mu + mu^2 / theta.
test_residuals <- (nb_actual.test - nb_predicted.test)
nb_pearson_residuals <- (nb_actual.test - nb_predicted.test) / sqrt(nb_predicted.test + nb_predicted.test^2 / nb_regr.final$theta)
hist(test_residuals, main = "Histogram of Test Residuals", xlab = "Residual", breaks = 100)
# Residual diagnostics on the raw scale.
plot(x = nb_predicted.test, y = test_residuals,
main = "Residuals vs. Predicted Values",
xlab = "Predicted Values",
ylab = "Residuals",
pch = 20,
cex = 0.8)
abline(h = 0)
# Residual diagnostics on the Pearson (variance-standardized) scale.
hist(nb_pearson_residuals, breaks = 100, main = "Histogram of Pearson Residuals",xlab = "Pearson Residuals")
plot(x = nb_predicted.test, y = nb_pearson_residuals,
main = "Pearson Residuals vs. Predicted Values",
xlab = "Predicted Values",
ylab = "Pearson Residuals",
pch = 20,
cex = 0.8)
abline(h = 0)
### Classification Task
For the classification task, the target is the ordinal categorical
priceRange variable, which categorizes each game by the
price at which it is currently sold on the Steam platform. The price
ranges are defined as: Low = 0-10, Medium = 11-20, and High = 21 or
above.
# Class distribution of the target: Low is more than twice as frequent as High.
table(games_df.filtered$priceRange)
##
## Low Medium High
## 2699 1823 1074
Since there is some class imbalance in the target variable, the training and testing partitions will be split using stratified sampling, which ensures that both partitions have a class distribution similar to that of the original data.
# all features used in classification tasks
# (game-level features joined with the per-game tag-cluster weights in
# clusterWeight.wide and the per-game cluster counts in cluster.count;
# appID is only used as the join key and is dropped afterwards).
classif.features <- left_join(games_df.filtered[,c('appID','priceRange','dlcCount', 'daysReleased', 'languagesCount', 'age16', 'industryLeader', 'selfPublished', 'aboveTwoHours', 'logMedianPlaytime', 'logRecommendations')], clusterWeight.wide, by = 'appID')
classif.features <- left_join(classif.features, cluster.count, by = 'appID') %>% dplyr::select(-appID)
# define task to store all features
classif_task = as_task_classif(classif.features,
target = "priceRange",
id = "classif task all")
# specify that the target and feature "age16" is stratified during split
classif_task$set_col_roles("priceRange", c("target", "stratum"))
classif_task$set_col_roles("age16", c("feature","stratum"))
# split train test partition
# (partition() honours the "stratum" column roles set above, so the 80/20
# split preserves the joint distribution of priceRange and age16).
classif_splits <- partition(classif_task, ratio = 0.8)
print(classif_task)
## <TaskClassif:classif task all> (5596 x 18)
## * Target: priceRange
## * Properties: multiclass, strata
## * Features (17):
## - dbl (14): X1, X2, X3, X4, X5, X6, X7, aboveTwoHours, clusterCount,
## daysReleased, dlcCount, languagesCount, logMedianPlaytime,
## logRecommendations
## - lgl (3): age16, industryLeader, selfPublished
## * Strata: priceRange, age16
# load featureless learner
# (baseline that predicts a single class for every game -- the confusion
# matrix below shows everything predicted as "Low"; a real model should
# beat its ~51.7% classification error).
lrn_featureless = lrn("classif.featureless")
lrn_featureless$train(classif_task, classif_splits$train)
prediction_bas = lrn_featureless$predict(classif_task, classif_splits$test)
prediction_bas$confusion
## truth
## response Low Medium High
## Low 540 364 215
## Medium 0 0 0
## High 0 0 0
prediction_bas$score(msr("classif.ce"))
## classif.ce
## 0.5174263
Since the classes are imbalanced, I will assign each class a weight inversely proportional to its frequency. These weights scale the cost of misclassifying each class, preventing the model from being biased toward the majority class.
# Class weights inversely proportional to class frequency:
#   w_k = N / (K * n_k), with K = 3 classes and N = total number of games.
# The minority class (High) gets a weight > 1 so its misclassification costs
# more during training; the majority class (Low) gets a weight < 1.
class_counts <- table(games_df.filtered$priceRange)
# One vectorized expression over all classes instead of one copy-pasted
# line per class; as.numeric() strips the table class from the result.
class_weights <- setNames(
  as.numeric(sum(class_counts) / (length(class_counts) * class_counts)),
  names(class_counts)
)
low_weight <- class_weights[["Low"]]
medium_weight <- class_weights[["Medium"]]
high_weight <- class_weights[["High"]]
cat('Weights: \n', 'Low: ', low_weight, '\nMedium: ', medium_weight, '\nHigh: ', high_weight)
## Weights:
## Low: 0.6911202
## Medium: 1.023222
## High: 1.736809
The Support Vector Machine (SVM) classifier identifies the optimal hyperplane (or decision boundary) that separates data points from different classes in high-dimensional space. This hyperplane is selected to maximize its distance from the nearest data points (support vectors), creating the widest possible margin. For predicting price ranges using game features and tag clusters, classes may not be linearly separable. The kernel trick addresses this by enlarging the feature space in a specific way, implicitly projecting data into a higher-dimensional space where separation is possible. The Support Vector Machine algorithm can then find the decision boundaries that maximize the margin between classes. Since I have three classes (Low, Medium, High), the algorithm would use a One vs One strategy by default, which would create 3(3-1)/2 = 3 binary classifiers: Low vs Medium, Low vs High, Medium vs High, and the final classification would be the class to which it was most frequently voted in these pairwise classifications.
Final tuned parameters result
# optimal hyperparameters from tuning
# (svm_ti is the tuning instance; the winning configuration is an RBF-kernel
# C-classification SVM with cost = 10, gamma ~ 0.006, and the
# inverse-frequency class weights computed above).
svm_ti$result$learner_param_vals
## [[1]]
## [[1]]$class.weights
## Low Medium High
## 0.6911202 1.0232218 1.7368094
##
## [[1]]$kernel
## [1] "radial"
##
## [[1]]$type
## [1] "C-classification"
##
## [[1]]$cost
## [1] 10
##
## [[1]]$gamma
## [1] 0.005994843
# Performance surface over the searched hyperparameter space.
autoplot(svm_ti, type = "surface")
Visualize the hyperplanes using two selected variables.
# Subset the task to include only the two variables and the target
classif_task.plot <- classif_task$clone()
classif_task.plot$select(c("logMedianPlaytime", "daysReleased"))
# create tuned model object
# (this 2-D refit is purely for visualization; its decision boundaries will
# differ from the full 17-feature model trained below).
svm_lrn_plot <- lrn("classif.svm")
svm_lrn_plot$param_set$values = svm_ti$result_learner_param_vals
# train tuned model with subset features
svm_lrn_plot$train(classif_task.plot)
# Plot decision boundaries using autoplot
autoplot(svm_lrn_plot, type = "prediction", task = classif_task.plot)
### Training the optimized model on entire train data
# create tuned model
# (predict_type = "prob" makes the prediction object carry class
# probabilities in addition to the hard response labels).
svm_tuned <- lrn("classif.svm", predict_type = "prob")
svm_tuned$param_set$values = svm_ti$result_learner_param_vals
# train tuned model with whole training partition
svm_tuned$train(classif_task, classif_splits$train)
# Predict on the held-out test rows only.
svm_tuned_prediction <- svm_tuned$predict(classif_task,
row_ids = classif_splits$test)
# autoplot(svm_tuned_prediction)
svm_tuned_prediction$confusion
## truth
## response Low Medium High
## Low 444 141 27
## Medium 83 174 76
## High 13 49 112
# Clear improvement over the featureless baseline (ce 0.348 vs 0.517).
svm_tuned_prediction$score(msrs(c("classif.ce", "classif.acc")))
## classif.ce classif.acc
## 0.3476318 0.6523682
Calculating precision and recall manually:
# Confusion matrix layout: rows = predicted class (response),
# columns = true class (truth).
svm_confusion <- svm_tuned_prediction$confusion
# Per-class precision = TP / row total (all games predicted as that class);
# per-class recall    = TP / column total (all games truly in that class).
# diag()/rowSums()/colSums() compute all three classes at once instead of
# repeating the same scalar indexing per class.
svm_precision_by_class <- setNames(diag(svm_confusion) / rowSums(svm_confusion),
                                   rownames(svm_confusion))
svm_recall_by_class <- setNames(diag(svm_confusion) / colSums(svm_confusion),
                                colnames(svm_confusion))
low_precision <- svm_precision_by_class[["Low"]]
medium_precision <- svm_precision_by_class[["Medium"]]
high_precision <- svm_precision_by_class[["High"]]
low_recall <- svm_recall_by_class[["Low"]]
medium_recall <- svm_recall_by_class[["Medium"]]
high_recall <- svm_recall_by_class[["High"]]
# macro average: unweighted mean over classes, so each class counts equally
# despite the class imbalance.
macro_precision <- mean(svm_precision_by_class)
macro_recall <- mean(svm_recall_by_class)
cat('Precision scores: \n',
paste('Low:', round(low_precision,2)),'\n',
paste('Medium:', round(medium_precision,2)),'\n',
paste('High:', round(high_precision,2)),'\n',
paste('Average Precision:', round(macro_precision,2)),'\n\n')
## Precision scores:
## Low: 0.73
## Medium: 0.52
## High: 0.64
## Average Precision: 0.63
cat('Recall scores: \n',
paste('Low:', round(low_recall,2)),'\n',
paste('Medium:', round(medium_recall,2)),'\n', # was 'Medium: ' -- fixed stray space for consistent label spacing
paste('High:', round(high_recall,2)),'\n',
paste('Average Recall:', round(macro_recall,2)))
## Recall scores:
## Low: 0.82
## Medium: 0.48
## High: 0.52
## Average Recall: 0.61
Results have shown that this model performed well in predicting
Low and High classes, but suffers when
predicting the Medium class. This is expected, as games
priced between 11 and 20 often exhibit characteristics of both expensive
and inexpensive games, making predictions for this class more
challenging.
The final optimized parameters are determined based on the classification log-loss (cross-entropy) measure, which quantifies the difference between predicted probabilities and actual outcomes. This measure is chosen because it penalizes confident incorrect predictions more heavily, making it well suited to data with class imbalance.
Optimized tuning parameters:
# Best hyperparameters found by tuning the random forest.
# NOTE(review): despite the rpart_ prefix, rpart_classif_ti holds ranger
# (random forest) parameters -- num.trees, min.node.size, etc. -- consider
# renaming the tuning instance for clarity.
rpart_classif_ti$result_learner_param_vals
## $class.weights
## Low Medium High
## 0.6911202 1.0232218 1.7368094
##
## $importance
## [1] "impurity"
##
## $num.threads
## [1] 1
##
## $max.depth
## [1] 30
##
## $min.node.size
## [1] 3
##
## $num.trees
## [1] 500
# Fresh ranger learner to be configured with the tuned values.
rforest_classif_tuned <- lrn("classif.ranger")
# train model with tuned hyperparameters
rforest_classif_tuned$param_set$values = rpart_classif_ti$result_learner_param_vals
rforest_classif_tuned$train(classif_task, classif_splits$train)
Ranking the feature’s importance based on the decrease in node impurity when a feature is used to split a node.
# Extract importance and convert to data.frame
# (impurity-based importance from the tuned ranger model; see the
# $importance = "impurity" hyperparameter above).
rpart_importance <- data.frame(
feature = names(rforest_classif_tuned$importance()),
importance = unname(rforest_classif_tuned$importance())
)
# Sort features by importance (descending)
# (sorted explicitly, even though the extracted values already appear to be
# in decreasing order, so the ranking does not depend on that behaviour).
rpart_importance <- rpart_importance %>%
arrange(desc(importance))
print(rpart_importance)
## feature importance
## 1 logMedianPlaytime 367.78669
## 2 daysReleased 360.77438
## 3 logRecommendations 338.64965
## 4 X4 261.19174
## 5 X3 195.64741
## 6 X5 183.81999
## 7 languagesCount 174.38276
## 8 X1 167.31301
## 9 X7 163.57605
## 10 X6 145.16650
## 11 X2 140.89194
## 12 dlcCount 136.30984
## 13 clusterCount 74.31522
## 14 industryLeader 57.80119
## 15 selfPublished 39.71734
## 16 aboveTwoHours 21.12932
## 17 age16 20.30442
Plot importance
# Horizontal bar chart of random-forest feature importance; reorder() ranks
# the bars so the most important feature ends up on top after coord_flip().
rpart_importance %>%
  ggplot(aes(x = reorder(feature, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "steelblue") + # identical bars to geom_col()
  coord_flip() +
  labs(
    title = "Feature Importance in Random Forest",
    x = "Feature",
    y = "Importance Score"
  ) +
  theme_minimal()
# Fit a single decision tree on the top-5 random-forest features, purely to
# visualize how those features split the data.
classif_rpart_lrn <- lrn("classif.rpart")
classif_rpart_subset <- classif_task$clone()
#look at how top 5 features from rforest would be plotted
# NOTE(review): head(..., 5) assumes importance() returns values in
# decreasing order -- consistent with the printed ranking above, but worth
# confirming.
classif_rpart_subset$select(names(head(rforest_classif_tuned$importance(),5)))
classif_rpart_lrn$train(classif_rpart_subset, row_ids = classif_splits$train)
suppressWarnings(
rpart.plot(classif_rpart_lrn$model)
)
### Test results
# Score the tuned random forest on the held-out test rows.
rforest_classif_prediction <- rforest_classif_tuned$predict(classif_task, classif_splits$test)
rforest_classif_prediction$confusion
## truth
## response Low Medium High
## Low 448 142 18
## Medium 76 169 80
## High 16 53 117
# Slightly better than the tuned SVM (ce 0.344 vs 0.348).
rforest_classif_prediction$score(msrs(c("classif.ce", "classif.acc")))
## classif.ce classif.acc
## 0.3440572 0.6559428
Visualize prediction using important features
# Get the test data from the task
test_data <- classif_task$data(rows = classif_splits$test)
# get predictions as df
predictions_df <- data.frame(
truth = rforest_classif_prediction$truth,
response = rforest_classif_prediction$response
)
# Combine test data with predictions
# (bind_cols relies on the prediction rows being in the same order as
# classif_splits$test -- the same row ids used for $data() above).
combined_data <- dplyr::bind_cols(test_data, predictions_df)
# Predicted class plotted in the space of the most important features.
ggplot(combined_data, aes(x = daysReleased, y = logMedianPlaytime, color = response)) +
geom_point() +
labs(color = "Predicted Price Range")
ggplot(combined_data, aes(x = logRecommendations, y = logMedianPlaytime, color = response)) +
geom_point() +
labs(color = "Predicted Price Range")
Precision and Recall scores for each class
# Confusion matrix layout: rows = predicted class (response),
# columns = true class (truth).
rforest_confusion <- rforest_classif_prediction$confusion
# Per-class precision = TP / row total (all games predicted as that class);
# per-class recall    = TP / column total (all games truly in that class).
# diag()/rowSums()/colSums() compute all three classes at once instead of
# repeating the same scalar indexing per class.
rf_precision_by_class <- setNames(diag(rforest_confusion) / rowSums(rforest_confusion),
                                  rownames(rforest_confusion))
rf_recall_by_class <- setNames(diag(rforest_confusion) / colSums(rforest_confusion),
                               colnames(rforest_confusion))
low_precision <- rf_precision_by_class[["Low"]]
medium_precision <- rf_precision_by_class[["Medium"]]
high_precision <- rf_precision_by_class[["High"]]
low_recall <- rf_recall_by_class[["Low"]]
medium_recall <- rf_recall_by_class[["Medium"]]
high_recall <- rf_recall_by_class[["High"]]
# macro average: unweighted mean over classes, so each class counts equally
# despite the class imbalance.
macro_precision <- mean(rf_precision_by_class)
macro_recall <- mean(rf_recall_by_class)
cat('Precision scores: \n'
,paste('Low:', round(low_precision,2)),'\n'
,paste('Medium:', round(medium_precision,2)),'\n'
,paste('High:', round(high_precision,2)),'\n'
,paste('Average Precision:', round(macro_precision,2)),'\n\n')
## Precision scores:
## Low: 0.74
## Medium: 0.52
## High: 0.63
## Average Precision: 0.63
cat('Recall scores: \n'
,paste('Low:', round(low_recall,2)),'\n'
,paste('Medium:', round(medium_recall,2)),'\n' # was 'Medium: ' -- fixed stray space for consistent label spacing
,paste('High:', round(high_recall,2)),'\n'
,paste('Average Recall:', round(macro_recall,2)))
## Recall scores:
## Low: 0.83
## Medium: 0.46
## High: 0.54
## Average Recall: 0.61